# ------------------------------------------------------------------------------
# Build risk factors for full cohort
# Author: Cassidy Shubatt <cshubatt@gmail.com>
# To run: bsub -q big -R "rusage[mem=10000]" bash 01_build_risk_factors.sh
# ------------------------------------------------------------------------------

# Setup ------------------------------------------------------------------------
library(yaml) # read_yaml absolute filepaths
library(data.table)
library(here) # here() relative filepaths
library(testit) # assert function
library(tidyverse)
library(glue)

u <- modules::use(here("lib", "util.R"))

# Load Data --------------------------------------------------------------------
# Read the two project YAML configs and set up an empty accumulator table that
# the per-split results are appended to further down.
message("Loading data...")
filepaths_yml <- here("lib", "filepaths.yml")
risk_factors_yml <- here::here("lib", "risk_factors.yml")
paths <- read_yaml(filepaths_yml)

# Named list: each entry is looked up by risk factor name and used to index
# feature columns later in the script.
risk_factor_dict <- read_yaml(risk_factors_yml)
feature_names <- unlist(risk_factor_dict, use.names = FALSE)

# Zero-row table with an id column plus one double column per risk factor.
risk_factor_df <- tibble(ed_enc_id = double())
for (risk_name in names(risk_factor_dict)) {
  risk_factor_df[[risk_name]] <- double()
}

# Construct risk factors -------------------------------------------------------
# For every sample split: load the feature matrix and the cohort ids, then flag
# an encounter as having a risk factor when any feature listed for that factor
# in `risk_factor_dict` is > 0. Results are appended to `risk_factor_df`.
message("Constructing risk factor variables...")
# glue() evaluates the features path template in this environment, so `split`
# and `sample_split` must keep these names.
# NOTE(review): presumably the template interpolates {split} -- confirm against
# lib/filepaths.yml.
split <- "random"
for (sample_split in c("val", "test", "train")) {
  # Report the split actually being processed (the loop variable), not the
  # constant `split`.
  message("Getting risk factors for ", sample_split, "...")
  features_split <- readRDS(glue(paths$features[[sample_split]])) %>%
    as.matrix()
  assert(
    "Risk factor features in features df",
    all(feature_names %in% colnames(features_split))
  )
  # drop = FALSE keeps a matrix even when only one feature is selected.
  features_split <- features_split[, feature_names, drop = FALSE] > 0
  ids_split <- readRDS(paths$cohort[[sample_split]]) %>%
    select(ed_enc_id) %>%
    setDT()
  # Features and ids are matched purely by row position, so the row counts
  # must agree before any flag is assigned.
  assert(
    "Features and cohort ids have the same number of rows",
    nrow(features_split) == nrow(ids_split)
  )
  for (x in names(risk_factor_dict)) {
    # drop = FALSE: a risk factor backed by a single feature would otherwise
    # collapse to a plain vector, which rowSums() rejects.
    ids_split[
      ,
      (x) := rowSums(
        features_split[, risk_factor_dict[[x]], drop = FALSE]
      ) > 0
    ]
  }

  # NOTE(review): this binds a tibble with a data.table; the class of the
  # result depends on rbind() dispatch -- confirm what downstream readers of
  # the saved file expect before changing it.
  risk_factor_df <- rbind(risk_factor_df, ids_split)
}

# Print means ------------------------------------------------------------------
# Log the mean of each risk factor column over all rows collected above.
message("Getting risk factor rates in full population...")
invisible(lapply(names(risk_factor_dict), function(risk_name) {
  message(risk_name, " rate: ", mean(risk_factor_df[[risk_name]]))
}))

# Save -------------------------------------------------------------------------
# Write the combined risk factor table to the path configured under
# analysis -> risk_factors in the filepaths config.
risk_factors_path <- paths$analysis$risk_factors
message("Saving risk factors to ", risk_factors_path, "...")
write_rds(risk_factor_df, risk_factors_path)

message("Done.")
